import sys
sys.path.insert(0, '../src/')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from datetime import date
import geopandas as gpd
from IPython.display import display, HTML
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.ar_model import AR
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from utils import load_pkl, generate_times
import seaborn as sns; sns.set()
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from metrics import *
from preprocessing import normalize
import tqdm as tqdm
from tqdm.auto import tqdm
tqdm.pandas()
# Imports classes
from Baseline import *
from Regressor import *
from utils import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Load IRIS contours (French statistical districts) and station metadata.
contour_iris = gpd.read_file(
    '../datasets/iris/iris.shp')
# Cast the code columns read as strings from the shapefile to integers.
convert_to_int = ['dep', 'insee_com', 'iris', 'code_iris']
for col in convert_to_int:
    contour_iris[col] = contour_iris[col].astype(int)
contour_iris = contour_iris[['code_iris', 'geometry', 'dep']]
contour_iris.head();
station_data = pd.read_csv("../datasets/station_to_iris.csv")
station_data.describe();
# Mapping station id -> transport mode; mode 3 is treated as subway here
# — TODO confirm against the stations_mode encoding.
stations_mode = load_pkl("../datasets/stations_mode.pkl")
subway_stations = [k for k, v in stations_mode.items() if v == 3]
print("Number of Subway stations: {}".format(len(subway_stations)))
Subway stations with fewer than $80000$ validations per $3$ months. Note that this is before we normalize the data. In the article, they removed $3$ subway stations, assuming they were closed for renovation work. Below we print the $4$ stations with the smallest numbers of validations.
# Subway stations with fewer than 80000 validations over the whole period.
station_data[(station_data['id'].isin(subway_stations)) & (station_data['validations_count'] < 80000)];
# Study period: Q4 2015. Validation matrices at three time discretizations,
# shaped (days, stations, time slots) — TODO confirm axis order.
dates = pd.date_range(start="2015-10-01", end="2015-12-31").date
matrix_6h = np.load("../datasets/6h_matrix.npy")
matrix_2h = np.load("../datasets/2h_matrix.npy")
matrix_15m = np.load("../datasets/15m_matrix.npy")
# Map of the subway stations over the inner-Paris departments (75, 92, 93, 94).
f, ax = plt.subplots(1, figsize=(16, 12))
ax = contour_iris[contour_iris['dep'].isin([75, 92, 93, 94])].plot(
    ax=ax, edgecolor='black', column='dep', cmap='icefire_r')
ax.scatter(station_data[station_data['id'].isin(subway_stations)]['x'],
           station_data[station_data['id'].isin(subway_stations)]['y'], color='firebrick', label='Subway Stations')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
# Fixed mojibake: 'ÃŽle' was a UTF-8 byte sequence mis-decoded as latin-1.
ax.set_title('Subway Stations in Île de France')
ax.legend()
plt.show();
Below we apply Min-Max normalization to the data, scaling it to the range $[0, 1]$.
# Wrap each matrix in a 3-D pd.Panel: items=days, major=stations, minor=times.
# NOTE(review): pd.Panel was removed in pandas 0.25 — this notebook requires
# an older pandas version to run.
data_matrix_6h = pd.Panel(normalize(matrix_6h),
                          items=dates,
                          major_axis=subway_stations,
                          minor_axis=generate_times("6h")
                          )
data_matrix_2h = pd.Panel(normalize(matrix_2h),
                          items=dates,
                          major_axis=subway_stations,
                          minor_axis=generate_times("2h")
                          )
# The 15-minute matrix is kept un-normalized here; the first hours are
# trimmed below before use.
data_matrix_15m_complete = pd.Panel(matrix_15m,
                                    items=dates,
                                    major_axis=subway_stations,
                                    minor_axis=generate_times("15min")
                                    )
Delete the first $4$ hours, from $00{:}00{:}00$ to $04{:}00{:}00$, because they carry little information: the number of validations in that range is mostly equal to $0$.
del_hours = 4
# Drop the first 4 hours: 4 slots per hour at 15-minute resolution.
data_matrix_15m = data_matrix_15m_complete.iloc[:, :, del_hours*4:]
data_matrix_15m.to_frame().head()
# Mean over the days axis; transposed copies put time on the index for plots.
dmatrix_mean_6h = data_matrix_6h.mean()
dmatrix_mean_2h = data_matrix_2h.mean()
dmatrix_mean_15m = data_matrix_15m.mean()
dtmatrix_mean_6h = dmatrix_mean_6h.transpose()
dtmatrix_mean_2h = dmatrix_mean_2h.transpose()
dtmatrix_mean_15m = dmatrix_mean_15m.transpose()
This is another way to display the stations with a small number of validations.
# Stations whose mean daily profile sums below 810 validations
# (threshold chosen empirically — TODO confirm against the article).
data_matrix_15m.mean(axis=0)[data_matrix_15m.mean(axis=0).sum(axis=1) < 810];
dmatrix_mean_15m.head()
dtmatrix_mean_15m.head()
# Mean validation profile per day, one subplot per discretization.
f, ax = plt.subplots(3, figsize=(16, 12))
ax1 = dtmatrix_mean_15m.plot(ax=ax[0], legend=False)
ax1.set_xticklabels([])
ax1.set_ylabel('Number of Validations')
ax1.set_title('15min')
ax2 = dtmatrix_mean_2h.plot(ax=ax[1])
ax2.set_xticklabels([])
ax2.set_ylabel('Number of Validations')
ax2.set_title('2h')
ax2.legend(bbox_to_anchor=(1., 1.01))
ax3 = dtmatrix_mean_6h.plot(ax=ax[2])
ax3.set_xlabel('Days')
ax3.set_ylabel('Number of Validations')
ax3.set_title('6h')
ax3.legend(bbox_to_anchor=(1., 1.01))
plt.xticks(rotation=90)
plt.show();
# Same three profiles as above, drawn as stacked area plots.
f, ax = plt.subplots(3, figsize=(16, 12))
ax1 = dtmatrix_mean_15m.plot.area(ax=ax[0], legend=False)
ax1.set_xticklabels([])
ax1.set_ylabel('Time')
ax1.set_title('15min')
ax2 = dtmatrix_mean_2h.plot.area(ax=ax[1])
ax2.set_xticklabels([])
ax2.set_ylabel('Time')
ax2.set_title('2h')
ax2.legend(bbox_to_anchor=(1., 1.01))
ax3 = dtmatrix_mean_6h.plot.area(ax=ax[2])
ax3.set_xlabel('Days')
ax3.set_ylabel('Time')
ax3.set_title('6h')
ax3.legend(bbox_to_anchor=(1., 1.01), loc=2)
plt.xticks(rotation=90)
plt.show();
# Per-station mean profile at 15 min in a single large axis.
fig = plt.figure(figsize=(16, 6))
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0])
dmatrix_mean_15m.plot(ax=ax, legend=False)
plt.ylabel('Number of Validations')
plt.title('15min')
plt.xticks(rotation=90)
plt.show();
# Split the 92 days column-wise into the three months (Oct: 31, Nov: 30,
# Dec: the rest) and plot each month separately.
f, ax = plt.subplots(3, figsize=(16, 12))
ax1 = dmatrix_mean_15m.iloc[:, :31].plot(ax=ax[0])
ax1.set_xticklabels([])
ax1.set_ylabel('Days')
ax1.set_title('October\'s number of validations')
ax1.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=2, loc=2,
           borderaxespad=0.)
ax2 = dmatrix_mean_15m.iloc[:, 31:61].plot(ax=ax[1])
ax2.set_xticklabels([])
ax2.set_ylabel('Days')
ax2.set_title('November\'s number of validations')
ax2.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=2, loc=2,
           borderaxespad=0.)
ax3 = dmatrix_mean_15m.iloc[:, 61:].plot(ax=ax[2])
ax3.set_xlabel('Time')
ax3.set_ylabel('Days')
ax3.set_title('December\'s number of validations')
plt.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), loc=2,
           ncol=2, borderaxespad=0.)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show();
# Dispersion of validations: by time slot (top) and by day (bottom).
f, ax = plt.subplots(2, figsize=(16, 12))
ax1 = dtmatrix_mean_15m.boxplot(return_type='both', ax=ax[0])
ax[0].set_xlabel("Time", fontsize=15)
ax[0].set_ylabel("Number of Validations", fontsize=15)
for tick in ax[0].get_xticklabels():
    tick.set_rotation(90)
ax2 = dmatrix_mean_15m.boxplot(return_type='both', ax=ax[1])
plt.xticks(rotation=90)
plt.tight_layout()
plt.show();
# Brings in the weekday date dictionaries (dict_w, dict_wd, dict_wd_final,
# dict_wd_oct/nov/dec, ...) — star import kept as in the original notebook.
from __init__ import *
# Restrict the 15-minute panel to weekdays, then to the "final" weekday set.
wd_15m = data_matrix_15m.loc[dict_w.values()]
wdm_15m = wd_15m.mean()
wdmt_15m = wdm_15m.transpose()
wd_15mf = data_matrix_15m.loc[dict_wd_final.values()]
wdm_15mf = wd_15mf.mean()
wdmt_15mf = wdm_15mf.transpose()
# Weekday mean profiles per month (October/November/December).
f, ax = plt.subplots(3, figsize=(16, 12))
ax1 = wdm_15m.loc[:, dict_wd_oct.values()].plot(ax=ax[0])
ax1.set_xticklabels([])
ax1.set_ylabel('Number of Validations')
ax1.set_title('Octobre')
ax1.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=2, loc=2,
           borderaxespad=0.)
ax2 = wdm_15m.loc[:, dict_wd_nov.values()].plot(ax=ax[1])
ax2.set_xticklabels([])
ax2.set_ylabel('Number of Validations')
ax2.set_title('Novembre')
ax2.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=2, loc=2,
           borderaxespad=0.)
ax3 = wdm_15m.loc[:, dict_wd_dec.values()].plot(ax=ax[2])
ax3.set_xlabel('Time')
ax3.set_ylabel('Number of Validations')
ax3.set_title('Decembre')
plt.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), loc=2,
           ncol=2, borderaxespad=0.)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show();
# Same plots restricted to the "final" weekday selection (Nov/Dec only).
f, ax = plt.subplots(2, figsize=(16, 8))
ax1 = wdm_15mf.loc[:, dict_wd_novf.values()].plot(ax=ax[0])
ax1.set_xticks([])
ax1.set_ylabel('Number of Validations')
ax1.set_title('Novembre')
ax1.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=2, loc=2,
           borderaxespad=0.)
ax2 = wdm_15mf.loc[:, dict_wd_decf.values()].plot(ax=ax[1])
ax2.set_xlabel('Time')
ax2.set_ylabel('Number of Validations')
ax2.set_title('Decembre')
plt.legend(bbox_to_anchor=(1., 0.9, 1.1, .102), loc=2,
           ncol=2, borderaxespad=0.)
plt.tight_layout()
plt.show();
# Dispersion of the final weekday selection: by time slot and by day.
f, ax = plt.subplots(2, figsize=(16, 12))
ax1 = wdmt_15mf.boxplot(return_type='both', ax=ax[0])
ax[0].set_xlabel("Time", fontsize=15)
ax[0].set_ylabel("Number of Validations", fontsize=15)
for tick in ax[0].get_xticklabels():
    tick.set_rotation(90)
ax2 = wdm_15mf.boxplot(return_type='both', ax=ax[1])
plt.xticks(rotation=90)
plt.tight_layout()
plt.show();
# Final weekday means: per station (top) vs per time slot (bottom).
fig, (ax1, ax2) = plt.subplots(2, figsize=(16, 12))
wdm_15mf.plot(ax=ax1, legend=False)
ax1.set_ylabel('Number of Validations'); ax1.set_title('15min')
ax2 = wdmt_15mf.plot(ax=ax2, legend=False)
ax2.set_ylabel('Number of Validations'); ax2.set_title('15min')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show();
# Autocorrelation of the mean series, via pandas and statsmodels.
fig, (ax1, ax2) = plt.subplots(2, figsize=(16, 12))
autocorrelation_plot(wdmt_15mf.mean(), ax=ax1, c='blue')
ax1.set_title('15min discretization matrix')
plot_acf(wdmt_15mf.mean(), ax=ax2, c='blue', title='Auto Correlation')
plt.show();
# Lag-1 scatter plot of the mean 15-minute series.
plt.figure(figsize=(16, 7))
lag_plot(wdmt_15mf.mean(), c='blue')
plt.title('Lag plot 15min discretization matrix')
# plot_pacf(wdmt_15mf.mean(), ax=ax[1], c='blue', title='Partial Auto Correlation')
plt.show();
# Train/test split over weekdays: shuffle day keys, first `size` days train.
# NOTE(review): np.random is not seeded, so the split is not reproducible.
dico = dict_wd
size = 45
X = data_matrix_15m.loc[dico.values()]
Xm = X.mean()
Xmt = Xm.transpose()
kw = list(dico.keys())
np.random.shuffle(kw)
vw = [dico[i] for i in kw]
ind_train = vw[:size]
ind_test = vw[size:]
X_train = X[ind_train]
X_test = X[ind_test]
X_train
X_test
def baseline_plot_results(levels, X_train=X_train, X_test=X_test):
    """Fit a Baseline per aggregation level, score it and plot the RMSE.

    Made consistent with ``ar_plot_results``: the train/test panels are
    keyword parameters defaulting to the module-level split, so the same
    function can be reused on baseline-residual data.

    :param levels: iterable of Baseline aggregation levels (e.g. ["None", "s"])
    :param X_train: training panel (days x stations x time slots)
    :param X_test: test panel, same layout as X_train
    :returns: (df_baseline_scores, baseline_preds)
    :rtype: (pd.DataFrame, list)
    """
    baseline_scores = []
    baseline_preds = []
    for level in levels:
        b = Baseline(level=level, first_ndays=5)
        b.fit(X_train)
        baseline_preds.append(b.predict(X_test))
        baseline_scores.append(b.score(X_test))
    # Rows are the metrics returned by Baseline.score, columns the levels.
    df_baseline_scores = pd.DataFrame(np.array(baseline_scores).T,
                                      index=['R2', 'RMSE', 'MSE', 'MAE', 'MAPE', 'MPE'],
                                      columns=levels)
    display(HTML(df_baseline_scores.to_html()))
    # Repeat each RMSE 4 times so the constant baselines show as flat lines.
    pd.DataFrame(df_baseline_scores.loc['RMSE'].values.repeat(4).reshape(-1, 4).T,
                 columns=levels).plot(figsize=(16, 4), kind='line');
    return df_baseline_scores, baseline_preds
# Evaluate the general baseline ("None") and the per-station baseline ("s").
levels = ["None", "s"]
df_baseline_scores, baseline_preds = baseline_plot_results(levels)
from cost_functions import mse, mse_g
from sklearn.linear_model import LinearRegression, Lasso
class myAR(Regressor):
    """Auto-regressive model over a (days x stations x time slots) panel.

    The model predicts each time slot from the previous ``order`` slots
    using a single shared weight vector ``self.w``.
    """

    def __init__(self, order=4, level=None, loss=mse, loss_g=mse_g, max_iter=1000,
                 eps=0.01):
        """Initialize the model parameters.

        :param order: size of the sliding window (number of lags)
        :param level: aggregation level (None means one global model)
        :param loss: cost function
        :param loss_g: gradient of the cost function
        :param max_iter: maximum number of gradient-descent iterations
        :param eps: gradient step size
        """
        self.order = order
        self.level = level
        self.max_iter, self.eps = max_iter, eps
        self.loss, self.loss_g = loss, loss_g
        # Random initial weights, one per lag.
        self.w = np.random.random(self.order)

    @Regressor.datax_decorator
    def analytic_fit(self, datax):
        """Find the optimal weights analytically.

        The decorator turns the panel into a (datax, X, y) design-matrix
        triple — assumed; confirm against Regressor.datax_decorator.

        :param datax: all examples of the dataset
        :returns: self
        """
        self.reg = LinearRegression()
        _, self.X, self.y = datax
        # Normal equations kept for comparison with the sklearn fit below.
        A, B = self.X.T.dot(self.X), self.X.T.dot(self.y)
        self.w1 = np.linalg.solve(A, B).ravel()
        self.reg.fit(self.X, self.y)
        self.w = self.reg.coef_.squeeze()
        display(HTML(pd.DataFrame(self.w.reshape(1, -1), index=['Weights'],
                                  columns=range(1, len(self.w)+1)).to_html()))
        return self

    def minibatch_fit(self, datax):
        """Mini-batch gradient-descent learning.

        Each batch is one day's window of ``order`` slots across all stations.

        :param datax: all examples of the dataset
        """
        for _ in range(self.max_iter):
            for d in range(datax.shape[0]):
                for t in range(datax.shape[2] - self.order):
                    batchx = datax.iloc[d, :, t:t + self.order].values
                    batchy = datax.iloc[d, :, t + self.order].values
                    self.w -= (self.eps * self.loss_g(batchx, batchy, self.w))

    def predict(self, datax):
        """One-step-ahead prediction using the true past window.

        :param datax: all examples of the dataset
        :returns: predicted labels
        :rtype: numpy array
        """
        y_pred = []
        for d in range(datax.shape[0]):
            y_pred.append([])
            for t in range(datax.shape[2] - self.order):
                batchx = datax.iloc[d, :, t:t + self.order].values
                y_pred[d].append(batchx.dot(self.w.T))
        return np.array(y_pred).transpose(0, 2, 1)

    def forecast_n(self, datax):
        """Free-running forecast: each prediction is fed back as input.

        :param datax: all examples of the dataset
        :returns: predicted labels
        :rtype: numpy array
        """
        y_pred = []
        for d in range(datax.shape[0]):
            y_pred.append([])
            # Seed the window with the first `order` real values of the day.
            batchx = datax.iloc[d, :, 0:self.order].values
            for t in range(datax.shape[2] - self.order):
                next_y = batchx.dot(self.w.T)
                y_pred[d].append(next_y)
                # Slide the window: drop the oldest slot, append the prediction.
                batchx = np.hstack(
                    (batchx[:, 1:], np.array(next_y).reshape(-1, 1)))
        return np.array(y_pred).transpose(0, 2, 1)

    def transform_batchx(self, batchx, tplus):
        """Roll a window forward ``tplus - 1`` steps by recursive prediction.

        Works on 2-D (stations x order) or 1-D (order,) windows.
        """
        if tplus == 1:
            return batchx
        for _ in range(tplus-1):
            next_y = batchx.dot(self.w.T)
            if batchx.ndim == 2:
                batchx = np.hstack((batchx[:, 1:],
                                    np.array(next_y).reshape(-1, 1)))
            elif batchx.ndim == 1:
                batchx = np.hstack((batchx[1:], next_y))
        return batchx

    # NOTE(review): this `forecast` is shadowed by the decorated `forecast`
    # defined further down; as written it is dead code — confirm intent.
    def forecast(self, datax, tplus=None):
        """Forecast at horizon ``tplus``.

        :param datax: all examples of the dataset
        :param tplus: horizon; tplus == 2 means predicting what happens at t+2
        :returns: predicted labels
        :rtype: numpy array
        """
        if tplus == None or tplus > self.order:
            return self.forecast_n(datax)
        else:
            y_pred = []
            batch_ind = self.order - tplus
            if datax.ndim == 3:
                for d in range(datax.shape[0]):
                    y_pred.append([])
                    # Take the first batch
                    batchx = datax.iloc[d, :, 0:self.order].values
                    # Predict till we finish the first round of tplus
                    for _ in range(tplus):
                        next_y = batchx.dot(self.w.T)
                        y_pred[d].append(next_y)
                        batchx = np.hstack((batchx[:, 1:],
                                            np.array(next_y).reshape(-1, 1)))
                    # After the first round of tplus, we have to replace some
                    # predicted values by the real ones and simultaneously
                    # replace the following columns by t+1,..., tplus
                    for t in range(1, datax.shape[2] - self.order - tplus + 1):
                        batchx = self.transform_batchx(
                            datax.iloc[d, :, t:self.order+t].values, tplus)
                        next_y = batchx.dot(self.w.T)
                        # next_y = np.where(next_y < 0, 0, next_y)
                        y_pred[d].append(next_y)
            elif datax.ndim == 2:
                # TODO
                pass
            elif datax.ndim == 1:
                batchx = datax.iloc[0:self.order].values
                for _ in range(tplus):
                    next_y = batchx.dot(self.w.T)
                    y_pred.append(next_y)
                    batchx = np.hstack((batchx[1:], next_y))
                print(datax.shape[0])
                for t in range(1, datax.shape[0] - self.order - tplus + 1):
                    batchx = self.transform_batchx(
                        datax.iloc[t:self.order+t].values, tplus)
                    next_y = batchx.dot(self.w.T)
                    # if next_y < 0: next_y = 0
                    y_pred.append(next_y)
                return np.array(y_pred)
            else:
                raise ValueError("Untreated datax number of dimensions")
            return np.array(y_pred).transpose(0, 2, 1)

    @Regressor.datax_decorator
    def again(self, datax):
        # Predict with the sklearn model and reshape back to panel layout.
        datax, self.X_test, self.y_test = datax
        y_pred = self.reg.predict(self.X_test)
        y_pred = y_pred.reshape((datax.shape[0] * datax.shape[1],
                                 datax.shape[2] - self.order),
                                order='F').reshape((datax.shape[0],
                                                    datax.shape[1],
                                                    datax.shape[2] - self.order))
        return y_pred

    def reshaped(self, y_pred, datax):
        """Reshape a flat prediction vector back to the panel layout.

        Fortran order is used because the design matrix interleaves
        days and stations — assumed; confirm against datax_decorator.
        """
        if datax.ndim == 3:
            return y_pred.reshape((datax.shape[0] * datax.shape[1],
                                   datax.shape[2] - self.order),
                                  order='F').reshape((datax.shape[0],
                                                      datax.shape[1],
                                                      datax.shape[2] - self.order))
        elif datax.ndim == 2:
            return y_pred.reshape((datax.shape[0], datax.shape[1] - self.order),
                                  order='F')

    # NOTE(review): redefinition — this decorated version shadows the
    # `forecast` defined above and is the one actually called by theAR.
    @Regressor.datax_decorator
    def forecast(self, datax, tplus):
        datax, self.X_test, self.y_test = datax
        if tplus == 1:
            return self.reshaped(self.reg.predict(self.X_test), datax)
        else:
            # Group the design matrix by time slot, then roll predictions
            # forward tplus steps, feeding each prediction into the next slot.
            self.X_test = self.X_test.reshape(datax.shape[-1] - self.order, -1, self.order)
            tmp = self.X_test[0]
            y_pred = self.reg.predict(tmp)
            pred = y_pred.copy()
            for x in self.X_test[1:]:
                x[:, -1] = pred.squeeze()
                x[:, -tplus:-1] = tmp[:, -tplus+1:]
                tmp = x.copy()
                pred = self.reg.predict(tmp)
                y_pred = np.vstack((y_pred, pred))
            return self.reshaped(y_pred, datax)

    @Regressor.datax_decorator
    def again_s(self, datax):
        # Per-station variant of `again`: 2-D reshape only.
        datax, self.X_test, self.y_test = datax
        y_pred = self.reg.predict(self.X_test)
        y_pred = y_pred.reshape((datax.shape[0], datax.shape[1] - self.order),
                                order='F')
        return y_pred
def panelIt(X_pred, X_test, order, subway_stations, del_hours=0):
    """Wrap raw predictions in a pd.Panel aligned with the test window.

    The first ``order`` slots of the test panel have no prediction, so the
    time axis starts ``del_hours * 4 + order`` slots into the 15-min grid.
    """
    trimmed_test = X_test.iloc[:, :, order:]
    time_labels = generate_times("15min")[(del_hours * 4) + order:]
    return pd.Panel(X_pred,
                    items=trimmed_test.items,
                    major_axis=subway_stations,
                    minor_axis=time_labels)
class theAR(Baseline):
    """Baseline-style wrapper around myAR, with optional per-station models."""

    # Class-level counter used by the per-station predict to walk
    # self.models in station order across apply() calls.
    station_id = 0

    def __init__(self, level=None, first_ndays=7, **kwargs):
        """Store the aggregation level and the myAR constructor kwargs.

        :param level: None (global), "s" (per station), "j"/"sj" (TODO)
        :param first_ndays: forwarded to the Baseline parent
        :param kwargs: forwarded to myAR (order, loss, ...)
        """
        super().__init__(level, first_ndays)
        self.kwargs = kwargs

    def fit(self, datax):
        """Fit one global model, or one model per station, depending on level.

        :param datax: training panel (days x stations x time slots)
        :raises ValueError: on an unknown level
        """
        if self.level is None:
            self.model = myAR(**self.kwargs)
            self.model.analytic_fit(datax)
        elif self.level.lower() == "s":
            # One myAR per station; apply over (days, times) slices.
            self.models = []
            datax.apply(lambda station: self.models.append(
                myAR(**self.kwargs).analytic_fit(station.T)),
                axis=(0, 2))
        elif self.level.lower() == "j":
            # TODO
            self.mean = []
            for d in range(self.first_ndays):
                # ddict_days comes from the `from __init__ import *` above.
                exist_ind = list(set(ddict_days[d].values()) & set(datax.items))
                self.mean.append(datax[exist_ind].mean().mean(axis=1))
        elif self.level.lower() == "sj":
            # TODO
            self.mean = []
            for d in range(self.first_ndays):
                exist_ind = list(set(ddict_days[d].values()) & set(datax.items))
                self.mean.append(datax[exist_ind].mean(axis=0))
        else:
            raise ValueError("Unknown value for level attribute, \
try: s, j, sj or None")

    def predict(self, datax, tplus=None):
        """Forecast the test panel at horizon ``tplus``.

        Also stores the metric scores in self.scores as a side effect.

        :returns: predictions wrapped in a pd.Panel via panelIt
        """
        def predict_for_station(x, tplus):
            # Forecast with the next fitted model; relies on apply() visiting
            # stations in the same order as fit() did.
            station_pred = self.models[self.station_id].forecast(x, tplus)
            self.station_id += 1
            return station_pred

        if self.level is None:
            X_pred = self.model.forecast(datax, tplus)
            self.scores = super().metrics_score(
                datax.iloc[:, :, self.model.order:], X_pred)
            return panelIt(X_pred, datax, self.model.order, subway_stations, del_hours)
        elif self.level.lower() == "s":
            X_pred = datax.apply(lambda x: predict_for_station(x.T, tplus),
                                 axis=(0, 2)).transpose(1, 0, 2)
            self.scores = super().metrics_score(
                datax.iloc[:, :, self.models[0].order:], X_pred.values)
            # Reset the counter so predict can be called again.
            self.station_id = 0
            return panelIt(X_pred.values, datax, self.models[0].order, subway_stations, del_hours)
        elif self.level.lower() == "j":
            # TODO
            pass
        elif self.level.lower() == "sj":
            # TODO
            pass
        else:
            raise ValueError("Unknown value for level attribute, \
try: s, j, sj or None")

    def score(self, datax):
        """Return the scores computed by the last predict() call.

        Note: `datax` is ignored; predict() must have been called first.
        """
        return self.scores
def ar_plot_results(level, order, limit_t, X_train=X_train, X_test=X_test):
    """Fit one theAR model and score forecasts for horizons t+1 .. t+limit_t.

    :param level: theAR aggregation level (None or "s")
    :param order: AR window size
    :param limit_t: largest forecast horizon
    :param X_train: training panel (defaults to the module-level split)
    :param X_test: test panel
    :returns: (ar_preds, ar_scores) — one prediction panel and one score
        vector per horizon
    """
    ar_scores = []
    ar_preds = []
    ar = theAR(level=level, order=order)
    print("Fitting...")
    ar.fit(X_train)
    print("Predicting...")
    for t in range(1, limit_t+1):
        ar_preds.append(ar.predict(X_test, t))
        # score() returns the metrics stored by the predict() call above.
        ar_scores.append(ar.score(X_test))
    display(HTML((pd.DataFrame(np.array(ar_scores).T,
                               index=['R2', 'RMSE', 'MSE', 'MAE', 'MAPE', 'MPE'],
                               columns=list(map(
                                   lambda x: "t+"+str(x),
                                   range(1, len(ar_scores)+1))))).to_html()))
    return ar_preds, ar_scores
def plot_qualitative_analysis(ar_preds, X_test, limit_t, order, subway_stations, del_hours):
    """Plot the true mean test profile and the mean forecast per horizon.

    NOTE(review): `subway_stations` and `del_hours` are accepted but unused
    in this body — kept for call-site compatibility.

    :param ar_preds: list of prediction panels, one per horizon
    :param X_test: test panel
    :param limit_t: number of horizons (one subplot each, plus the truth)
    :param order: AR window size (first `order` slots have no prediction)
    """
    fig, ax = plt.subplots(limit_t+1, figsize=(16, limit_t*4))
    wd_testorder_15m = X_test.iloc[:, :, order:]
    wdm_testorder_15m = wd_testorder_15m.mean()
    wdm_testorder_15m.plot(ax=ax[0])
    ax[0].set_ylabel('Number of Validations')
    ax[0].set_title('Test')
    ax[0].legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=2, loc=2,
                 borderaxespad=0.)
    for i in range(limit_t):
        pred_t = ar_preds[i].mean()
        pred_t.plot(ax=ax[i+1])
        ax[i+1].set_ylabel('Number of Validations')
        ax[i+1].set_title("Predict t+{}".format(i+1))
        ax[i+1].legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=2, loc=2,
                       borderaxespad=0.)
    plt.tight_layout()
    plt.show();
def plot_specific(X_test, baseline_preds, ar_preds, ar_preds_s, order, limit_t, j, s):
    """Compare all models on one specific day `j` and station `s`.

    :param baseline_preds: [general baseline panel, per-station baseline panel]
    :param ar_preds: general AR prediction panels, one per horizon
    :param ar_preds_s: per-station AR prediction panels, one per horizon
    :param j: day index into the test panel
    :param s: station index
    """
    fig, ax = plt.subplots(limit_t, figsize=(16, limit_t*5))
    for t in range(limit_t):
        ar_preds[t].iloc[j, s].plot(ax=ax[t], label='General AR')
        ar_preds_s[t].iloc[j, s].plot(ax=ax[t], label='AR By Station')
        X_test.iloc[j, s].plot(ax=ax[t], label="Real values")
        baseline_preds[0].iloc[j, s].plot(ax=ax[t], style=['.--'], label='General Baseline')
        baseline_preds[1].iloc[j, s].plot(ax=ax[t], style=['.--'], label='Baseline per station')
        ax[t].set_ylabel('Number of Validations')
        ax[t].set_title("AR models at t+{} with an order of {}".format(t+1, order))
        ax[t].legend(bbox_to_anchor=(1., 0.9, 1.1, .102), ncol=1, loc=2,
                     borderaxespad=0.)
    plt.tight_layout()
    plt.show();
# Fit and evaluate the global and per-station AR models.
order, limit_t = 8, 6
%%time
ar_preds, ar_scores = ar_plot_results(None, order, limit_t)
%%time
ar_preds_s, ar_scores_s = ar_plot_results("s", order, limit_t)
# NOTE(review): `j` and `s` are not defined anywhere in this file —
# presumably set interactively in the notebook; confirm before running.
plot_specific(X_test, baseline_preds, ar_preds, ar_preds_s, order, limit_t, j, s)
plot_qualitative_analysis(ar_preds, X_test, limit_t, order, subway_stations, del_hours)
plot_qualitative_analysis(ar_preds_s, X_test, limit_t, order, subway_stations, del_hours)
# RMSE across horizons: AR vs the general baseline (baseline is constant).
fig, ax = plt.subplots(1, figsize=(10, 6))
ax.set_prop_cycle(color=['darkgoldenrod', 'brown'])
x = range(1, limit_t+1)
baseline_score = df_baseline_scores.loc['RMSE', 'None'].repeat(limit_t).reshape(-1, limit_t).T
model_score = np.array(ar_scores).T[1]
ax = plt.plot(x, model_score, linewidth=3, label="AR")
ax = plt.scatter(x, model_score, marker='*', s=100)
ax = plt.plot(x, baseline_score, linewidth=3, label="General baseline")
ax = plt.scatter(x, baseline_score, marker='*', s=100)
plt.legend(prop={'size': 20})
plt.title("RMSE of General baseline and AR model, from $t+1$ to $t+{}$".format(limit_t), fontsize=16)
plt.xlabel("T plus", fontsize=16); plt.ylabel("RMSE", fontsize=16);
# Same comparison for the per-station models.
fig, ax = plt.subplots(1, figsize=(10, 6))
ax.set_prop_cycle(color=['darkgoldenrod', 'brown'])
x = range(1, limit_t+1)
baseline_score = df_baseline_scores.loc['RMSE', 's'].repeat(limit_t).reshape(-1, limit_t).T
model_score = np.array(ar_scores_s).T[1]
ax = plt.plot(x, model_score, linewidth=3, label="AR per station")
ax = plt.scatter(x, model_score, marker='*', s=100)
ax = plt.plot(x, baseline_score, linewidth=3, label="Baseline per station")
ax = plt.scatter(x, baseline_score, marker='*', s=100)
plt.legend(prop={'size': 20})
plt.title("RMSE of baseline and AR model for each station, from $t+1$ to $t+{}$".format(limit_t), fontsize=16)
plt.xlabel("T plus", fontsize=16); plt.ylabel("RMSE", fontsize=16);
# Combined view: both baselines and both AR models on a single axis.
fig, ax = plt.subplots(1, figsize=(16, 6))
ax.set_prop_cycle(color=['crimson', 'teal', 'b', 'darkgoldenrod'])
x = range(1, limit_t+1)
baseline_scores = df_baseline_scores.loc['RMSE'].values.repeat(limit_t).reshape(-1, limit_t).T
model_scores = np.vstack((np.array(ar_scores).T[1], np.array(ar_scores_s).T[1])).T
baselineObjects = plt.plot(x, baseline_scores, linewidth=3)
labels = ["General baseline", "Baseline per station", "AR", "AR per station"]
arlineObjects = plt.plot(x, model_scores, linewidth=3)
# ['D', '*', '|', 'X']
# labels = ["Full baseline", "Baseline per station", "Baseline per day",
#           "Baseline per station and day", "Full AR", "AR per station"]
for i, m in zip(range(4), ['D', '*']):
    ax = plt.scatter(x, baseline_scores[:, i], marker=m, s=100)
for i, m in zip(range(2), ['D', '*']):
    ax = plt.scatter(x, model_scores[:, i], marker=m, s=100)
plt.legend(baselineObjects+arlineObjects, labels, prop={'size': 15})
plt.title("RMSE of baseline and AR model for each station, from $t+1$ to $t+{}$".format(limit_t), fontsize=16)
plt.xlabel("T plus", fontsize=16); plt.ylabel("RMSE", fontsize=16);
# Re-assert the hyper-parameters for the baseline-residual experiment below.
order, limit_t = 8, 6
def baseline_sub(X, baseline):
    """Subtract the baseline's first item from every day slice of X."""
    def _minus_baseline(day_slice):
        return day_slice - baseline.iloc[0]
    return X.apply(_minus_baseline, axis=(1, 2))
def baseline_add(X, baseline):
    """Add the baseline's first item back onto every day slice of X."""
    def _plus_baseline(day_slice):
        return day_slice + baseline.iloc[0]
    return X.apply(_plus_baseline, axis=(1, 2))
# Residual panels: remove the general (index 0) and per-station (index 1)
# baseline predictions so the AR models fit only the residual signal.
Xb_train = baseline_sub(X_train, baseline_preds[0])
Xb_test = baseline_sub(X_test, baseline_preds[0])
Xb_train_s = baseline_sub(X_train, baseline_preds[1])
Xb_test_s = baseline_sub(X_test, baseline_preds[1])
%%time
# Fit AR on residuals, then add the baseline back to get full predictions.
ar_preds, ar_scores = ar_plot_results(None, order, limit_t, X_train=Xb_train, X_test=Xb_test)
for t in range(limit_t):
    ar_preds[t] = baseline_add(ar_preds[t], baseline_preds[0])
%%time
ar_preds_s, ar_scores_s = ar_plot_results("s", order, limit_t, X_train=Xb_train_s, X_test=Xb_test_s)
for t in range(limit_t):
    ar_preds_s[t] = baseline_add(ar_preds_s[t], baseline_preds[1])
# NOTE(review): `j` and `s` are not defined in this file — set interactively?
plot_specific(X_test, baseline_preds, ar_preds, ar_preds_s, order, limit_t, j, s)
plot_qualitative_analysis(ar_preds, X_test, limit_t, order, subway_stations, del_hours)
plot_qualitative_analysis(ar_preds_s, X_test, limit_t, order, subway_stations, del_hours)
# RMSE of the baseline-corrected AR vs the general baseline.
fig, ax = plt.subplots(1, figsize=(10, 6))
ax.set_prop_cycle(color=['darkgoldenrod', 'brown'])
x = range(1, limit_t+1)
baseline_score = df_baseline_scores.loc['RMSE', 'None'].repeat(limit_t).reshape(-1, limit_t).T
model_score = np.array(ar_scores).T[1]
ax = plt.plot(x, model_score, linewidth=3, label="AR + baseline")
ax = plt.scatter(x, model_score, marker='*', s=100)
ax = plt.plot(x, baseline_score, linewidth=3, label="General baseline")
ax = plt.scatter(x, baseline_score, marker='*', s=100)
plt.legend(prop={'size': 20})
plt.title("RMSE of full baseline and AR model, from $t+1$ to $t+{}$".format(limit_t), fontsize=16)
plt.xlabel("T plus", fontsize=16); plt.ylabel("RMSE", fontsize=16);
# Same comparison for the per-station residual models.
fig, ax = plt.subplots(1, figsize=(10, 6))
ax.set_prop_cycle(color=['darkgoldenrod', 'brown'])
x = range(1, limit_t+1)
baseline_score = df_baseline_scores.loc['RMSE', 's'].repeat(limit_t).reshape(-1, limit_t).T
model_score = np.array(ar_scores_s).T[1]
ax = plt.plot(x, model_score, linewidth=3, label="AR per station + baseline")
ax = plt.scatter(x, model_score, marker='*', s=100)
ax = plt.plot(x, baseline_score, linewidth=3, label="Baseline per station")
ax = plt.scatter(x, baseline_score, marker='*', s=100)
plt.legend(prop={'size': 20})
plt.title("RMSE of baseline and AR model for each station, from $t+1$ to $t+{}$".format(limit_t), fontsize=16)
plt.xlabel("T plus", fontsize=16); plt.ylabel("RMSE", fontsize=16);
# Combined residual-model view, y-axis clipped to the per-station AR range.
fig, ax = plt.subplots(1, figsize=(16, 6))
ax.set_prop_cycle(color=['crimson', 'teal', 'b', 'darkgoldenrod'])
x = range(1, limit_t+1)
baseline_scores = df_baseline_scores.loc['RMSE'].values.repeat(limit_t).reshape(-1, limit_t).T
model_scores = np.vstack((np.array(ar_scores).T[1], np.array(ar_scores_s).T[1])).T
baselineObjects = plt.plot(x, baseline_scores, linewidth=3)
labels = ["General baseline", "Baseline per station", "AR + baseline", "AR per station + baseline"]
arlineObjects = plt.plot(x, model_scores, linewidth=3)
for i, m in zip(range(4), ['D', '*']):
    ax = plt.scatter(x, baseline_scores[:, i], marker=m, s=100)
for i, m in zip(range(2), ['D', '*']):
    ax = plt.scatter(x, model_scores[:, i], marker=m, s=100)
plt.ylim((0, model_scores[:, 1].max()))
plt.legend(baselineObjects+arlineObjects, labels, prop={'size': 15})
plt.title("RMSE of baseline and AR model for each station, from $t+1$ to $t+{}$".format(limit_t), fontsize=16)
plt.xlabel("T plus", fontsize=16); plt.ylabel("RMSE", fontsize=16);